The Breast Cancer (Wisconsin) Diagnosis dataset contains the diagnosis and a set of 30 features describing the characteristics of the cell nuclei present in the digitized image of a fine needle aspirate (FNA) of a breast mass.
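Ten real-valued features are computed for each cell nucleus: radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry and fractal dimension.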
The mean, standard error (SE) and “worst” or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. We will analyze the features to understand their predictive value for diagnosis. We will then create models using several different algorithms (Naive Bayes, random forest, decision tree, AdaBoost and SVM) and use the models to predict the diagnosis.
suppressMessages(library(ggplot2))
suppressMessages(library(GGally))
suppressMessages(library(dplyr))
suppressMessages(library(DataExplorer))
suppressMessages(library(funModeling))
# Load the raw table from the CSV in the working directory and preview
# its first ten rows.
wbcd <- read.csv(file = "data.csv")
head(x = wbcd, n = 10)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 17.99 10.38 122.80 1001.0
## 2 842517 M 20.57 17.77 132.90 1326.0
## 3 84300903 M 19.69 21.25 130.00 1203.0
## 4 84348301 M 11.42 20.38 77.58 386.1
## 5 84358402 M 20.29 14.34 135.10 1297.0
## 6 843786 M 12.45 15.70 82.57 477.1
## 7 844359 M 18.25 19.98 119.60 1040.0
## 8 84458202 M 13.71 20.83 90.20 577.9
## 9 844981 M 13.00 21.82 87.50 519.8
## 10 84501001 M 12.46 24.04 83.97 475.9
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.30010 0.14710
## 2 0.08474 0.07864 0.08690 0.07017
## 3 0.10960 0.15990 0.19740 0.12790
## 4 0.14250 0.28390 0.24140 0.10520
## 5 0.10030 0.13280 0.19800 0.10430
## 6 0.12780 0.17000 0.15780 0.08089
## 7 0.09463 0.10900 0.11270 0.07400
## 8 0.11890 0.16450 0.09366 0.05985
## 9 0.12730 0.19320 0.18590 0.09353
## 10 0.11860 0.23960 0.22730 0.08543
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## 7 0.1794 0.05742 0.4467 0.7732 3.180
## 8 0.2196 0.07451 0.5835 1.3770 3.856
## 9 0.2350 0.07389 0.3063 1.0020 2.406
## 10 0.2030 0.08243 0.2976 1.5990 2.039
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## 7 53.91 0.004314 0.01382 0.02254 0.01039
## 8 50.96 0.008805 0.03029 0.02488 0.01448
## 9 24.32 0.005731 0.03502 0.03553 0.01226
## 10 23.94 0.007149 0.07217 0.07743 0.01432
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.03003 0.006193 25.38 17.33 184.60
## 2 0.01389 0.003532 24.99 23.41 158.80
## 3 0.02250 0.004571 23.57 25.53 152.50
## 4 0.05963 0.009208 14.91 26.50 98.87
## 5 0.01756 0.005115 22.54 16.67 152.20
## 6 0.02165 0.005082 15.47 23.75 103.40
## 7 0.01369 0.002179 22.88 27.66 153.20
## 8 0.01486 0.005412 17.06 28.14 110.60
## 9 0.02143 0.003749 15.49 30.73 106.20
## 10 0.01789 0.010080 15.09 40.68 97.65
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## 7 1606.0 0.1442 0.2576 0.3784
## 8 897.0 0.1654 0.3682 0.2678
## 9 739.3 0.1703 0.5401 0.5390
## 10 711.4 0.1853 1.0580 1.1050
## concave.points_worst symmetry_worst fractal_dimension_worst X
## 1 0.2654 0.4601 0.11890 NA
## 2 0.1860 0.2750 0.08902 NA
## 3 0.2430 0.3613 0.08758 NA
## 4 0.2575 0.6638 0.17300 NA
## 5 0.1625 0.2364 0.07678 NA
## 6 0.1741 0.3985 0.12440 NA
## 7 0.1932 0.3063 0.08368 NA
## 8 0.1556 0.3196 0.11510 NA
## 9 0.2060 0.4378 0.10720 NA
## 10 0.2210 0.4366 0.20750 NA
dim(wbcd)
## [1] 569 33
str(wbcd)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : chr "M" "M" "M" "M" ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
# Histograms (10 bins each) of the numeric columns, leaving out the record id.
wbcd %>%
  select(-id) %>%
  plot_num(bins = 10)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Drop the two columns that are not features: `X` holds only NA in the
# str() preview above and `id` is a record identifier. Both are removed by
# name, so the result no longer depends on `id` being the first column.
wbcd$X <- NULL
wbcd$id <- NULL

# Recode the target with explicit levels. The previous ifelse() labelled every
# value other than "B" as "Malignant"; here an unexpected code becomes NA and
# is reported by the check below instead of being silently misclassified.
wbcd$diagnosis <- factor(
  wbcd$diagnosis,
  levels = c("B", "M"),
  labels = c("Benign", "Malignant")
)
stopifnot(!anyNA(wbcd$diagnosis))
head(wbcd, 10)
## diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 Malignant 17.99 10.38 122.80 1001.0 0.11840
## 2 Malignant 20.57 17.77 132.90 1326.0 0.08474
## 3 Malignant 19.69 21.25 130.00 1203.0 0.10960
## 4 Malignant 11.42 20.38 77.58 386.1 0.14250
## 5 Malignant 20.29 14.34 135.10 1297.0 0.10030
## 6 Malignant 12.45 15.70 82.57 477.1 0.12780
## 7 Malignant 18.25 19.98 119.60 1040.0 0.09463
## 8 Malignant 13.71 20.83 90.20 577.9 0.11890
## 9 Malignant 13.00 21.82 87.50 519.8 0.12730
## 10 Malignant 12.46 24.04 83.97 475.9 0.11860
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 1 0.27760 0.30010 0.14710 0.2419
## 2 0.07864 0.08690 0.07017 0.1812
## 3 0.15990 0.19740 0.12790 0.2069
## 4 0.28390 0.24140 0.10520 0.2597
## 5 0.13280 0.19800 0.10430 0.1809
## 6 0.17000 0.15780 0.08089 0.2087
## 7 0.10900 0.11270 0.07400 0.1794
## 8 0.16450 0.09366 0.05985 0.2196
## 9 0.19320 0.18590 0.09353 0.2350
## 10 0.23960 0.22730 0.08543 0.2030
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## 7 0.05742 0.4467 0.7732 3.180 53.91
## 8 0.07451 0.5835 1.3770 3.856 50.96
## 9 0.07389 0.3063 1.0020 2.406 24.32
## 10 0.08243 0.2976 1.5990 2.039 23.94
## smoothness_se compactness_se concavity_se concave.points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## 7 0.004314 0.01382 0.02254 0.01039 0.01369
## 8 0.008805 0.03029 0.02488 0.01448 0.01486
## 9 0.005731 0.03502 0.03553 0.01226 0.02143
## 10 0.007149 0.07217 0.07743 0.01432 0.01789
## fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
## 1 0.006193 25.38 17.33 184.60 2019.0
## 2 0.003532 24.99 23.41 158.80 1956.0
## 3 0.004571 23.57 25.53 152.50 1709.0
## 4 0.009208 14.91 26.50 98.87 567.7
## 5 0.005115 22.54 16.67 152.20 1575.0
## 6 0.005082 15.47 23.75 103.40 741.6
## 7 0.002179 22.88 27.66 153.20 1606.0
## 8 0.005412 17.06 28.14 110.60 897.0
## 9 0.003749 15.49 30.73 106.20 739.3
## 10 0.010080 15.09 40.68 97.65 711.4
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## 1 0.1622 0.6656 0.7119 0.2654
## 2 0.1238 0.1866 0.2416 0.1860
## 3 0.1444 0.4245 0.4504 0.2430
## 4 0.2098 0.8663 0.6869 0.2575
## 5 0.1374 0.2050 0.4000 0.1625
## 6 0.1791 0.5249 0.5355 0.1741
## 7 0.1442 0.2576 0.3784 0.1932
## 8 0.1654 0.3682 0.2678 0.1556
## 9 0.1703 0.5401 0.5390 0.2060
## 10 0.1853 1.0580 1.1050 0.2210
## symmetry_worst fractal_dimension_worst
## 1 0.4601 0.11890
## 2 0.2750 0.08902
## 3 0.3613 0.08758
## 4 0.6638 0.17300
## 5 0.2364 0.07678
## 6 0.3985 0.12440
## 7 0.3063 0.08368
## 8 0.3196 0.11510
## 9 0.4378 0.10720
## 10 0.4366 0.20750
summary(wbcd)
## diagnosis radius_mean texture_mean perimeter_mean
## Benign :357 Min. : 6.981 Min. : 9.71 Min. : 43.79
## Malignant:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean radius_se
## Min. :0.00000 Min. :0.1060 Min. :0.04996 Min. :0.1115
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324
## Median :0.03350 Median :0.1792 Median :0.06154 Median :0.3242
## Mean :0.04892 Mean :0.1812 Mean :0.06280 Mean :0.4052
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789
## Max. :0.20120 Max. :0.3040 Max. :0.09744 Max. :2.8730
## texture_se perimeter_se area_se smoothness_se
## Min. :0.3602 Min. : 0.757 Min. : 6.802 Min. :0.001713
## 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169
## Median :1.1080 Median : 2.287 Median : 24.530 Median :0.006380
## Mean :1.2169 Mean : 2.866 Mean : 40.337 Mean :0.007041
## 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146
## Max. :4.8850 Max. :21.980 Max. :542.200 Max. :0.031130
## compactness_se concavity_se concave.points_se symmetry_se
## Min. :0.002252 Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
## Median :0.020450 Median :0.02589 Median :0.010930 Median :0.018730
## Mean :0.025478 Mean :0.03189 Mean :0.011796 Mean :0.020542
## 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
## Max. :0.135400 Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave.points_worst symmetry_worst fractal_dimension_worst
## Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
# Visual check of the share of missing values per column.
plot_missing(wbcd)
#Insight into Breast Cancer Wisconsin- Data
# Bar chart of the class balance, with the count printed above each bar.
# after_stat(count) replaces the deprecated `..count..` notation.
ggplot(data = wbcd, aes(x = diagnosis, fill = diagnosis)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  labs(
    title = "Diagnosis of Breast Cancer",
    subtitle = "Most of the diagnosis (63%) are Benign",
    caption = "Data owned by the University of Wisconsin",
    x = "Diagnosis",
    y = "Number of observations"
  )
Over here, we will be using the mean perimeter and the mean radius observed from the center of the lump to the perimeter. This will reveal how both types of lumps look in relative size.
# Mean radius against mean perimeter, coloured by diagnosis.
# NOTE(review): the dashed guides at radius 18 and perimeter 116 and the
# "45%" annotation are hard-coded; presumably they mark the upper edge of the
# benign observations -- confirm against the data.
ggplot(wbcd, aes(radius_mean, perimeter_mean, colour = diagnosis)) +
  geom_point() +
  geom_hline(yintercept = 116.0, linetype = "dashed", colour = "gray") +
  geom_vline(xintercept = 18.00, linetype = "dashed", colour = "gray") +
  labs(
    title = "Mean Perimeter and Mean Radius",
    subtitle = "Malignant lumps can get relatively bigger than benigns",
    caption = "Data owned by the University of Wisconsin",
    x = "Mean Radius",
    y = "Mean Perimeter"
  ) +
  annotate(
    "text",
    x = 24,
    y = 150,
    label = "45% of malignants are bigger than every observed benign",
    size = 2.3,
    angle = 45
  )
Insights: Malignant lumps can get relatively bigger than benign lumps. This has the possibility of sparking up a hypothesis that malignant lumps begin as benigns.
# Mean texture against mean smoothness, coloured by diagnosis. The dashed
# vertical line sits at 18.84, the median of texture_mean reported by
# summary(wbcd) above.
ggplot(
  data = wbcd,
  aes(x = texture_mean, y = smoothness_mean, color = diagnosis)
) +
  geom_point() +
  geom_vline(xintercept = 18.84, linetype = "dashed", color = "gray") +
  labs(
    title = "Mean Texture and Smoothness of Lumps", # fixed typo "Smoothess"
    subtitle = "Most benigns (66%) are below the median mean texture",
    caption = "Data owned by the University of Wisconsin",
    x = "Mean Texture",
    y = "Mean Smoothness"
  ) +
  annotate("text", label = "median = 18.84", x = 22, y = 0.160, size = 2.5)
Insights from Texture and Smoothness Visualization
Not a lot of variation can be seen in the mean smoothness of the two diagnoses, as both seem to be clustered from the bottom to the upper midsection of the plot. However, we can observe that most of the malignants (66%) lie to the right of the median. This suggests that malignant lumps display higher texture variation values than benigns.
# Mean compactness against mean concavity, coloured by diagnosis, with a
# smoothed trend per class (geom_smooth() defaults).
ggplot(wbcd, aes(compactness_mean, concavity_mean, colour = diagnosis)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Mean Compactness and Mean Concavity",
    subtitle = "Most benigns display less concavity and compactness",
    caption = "Data owned by the University of Wisconsin",
    x = "Mean Compactness",
    y = "Mean Concavity"
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Insight from Compactness and Concavity
There is a clear display of outliers within the data. However, a visual analysis reveals that benign lumps tend to have low mean concavity and low mean compactness. This is manifested in the benigns being skewed towards the bottom-left side of the graph. Notice that the malignants display a wider range, from low concavity and low compactness to high concavity and high compactness. This visualization suggests that benigns usually have low to medium-severe concavities at the contours of the lumps, whereas malignant lumps can display anywhere between low and very high concavity and compactness.
# Pairwise scatter/correlation matrices, one per feature group. After the
# cleaning step column 1 is `diagnosis`, columns 2-11 are the *_mean features,
# 12-21 the *_se features and 22-31 the *_worst features.
# Fixes: every plot used to be titled "Cancer Worst", and each call had a
# stray trailing comma after the data argument.

#### (a)MEAN
ggpairs(wbcd[, 2:11]) +
  theme_bw() +
  labs(title = "Cancer Mean") +
  theme(
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 13)
  )
#### (b)STANDARD ERROR
ggpairs(wbcd[, 12:21]) +
  theme_bw() +
  labs(title = "Cancer SE") +
  theme(
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 13)
  )
#### (c)WORST
ggpairs(wbcd[, 22:31]) +
  theme_bw() +
  labs(title = "Cancer Worst") +
  theme(
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 13)
  )
# Labelled correlation heat maps for the three feature groups
# (columns 2-11, 12-21 and 22-31 of wbcd). Legend hidden, title centred.
ggcorr(wbcd[, 2:11], name = "corr", label = TRUE) +
  labs(title = "Cancer Mean") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )
ggcorr(wbcd[, 12:21], name = "corr", label = TRUE) +
  labs(title = "Cancer SE") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )
ggcorr(wbcd[, 22:31], name = "corr", label = TRUE) +
  labs(title = "Cancer Worst") +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", color = "black", hjust = 0.5, size = 12)
  )
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Working copy for the PCA section. transform() without any tag = value
# arguments returns its input unchanged, so a plain assignment says the same
# thing directly.
wbcd_pca <- wbcd
Using all 30 features, the cumulative proportion of variance from PC1 to PC6 is about 88.8% (above 85%). This means that PC1–PC6 together explain about 88.8% of the total variance in the data.
# PCA on all 30 features (column 1, the diagnosis, is excluded).
# `scale. = TRUE` standardises each feature to unit variance before the
# decomposition. The former `cor = TRUE` is not a prcomp() argument (it
# belongs to princomp()) and was only producing an "extra argument" warning.
all_pca <- prcomp(wbcd_pca[, -1], scale. = TRUE)
summary(all_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
For the mean features, the cumulative proportion of variance from PC1 to PC3 is about 88.8% (above 85%).
# PCA restricted to the ten *_mean features (columns 2-11), standardised.
# `scale.` is the documented argument name; `scale` only worked through
# partial argument matching.
mean_pca <- prcomp(wbcd_pca[, 2:11], scale. = TRUE)
summary(mean_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3406 1.5870 0.93841 0.7064 0.61036 0.35234 0.28299
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241 0.00801
## Cumulative Proportion 0.5479 0.7997 0.88779 0.9377 0.97495 0.98736 0.99537
## PC8 PC9 PC10
## Standard deviation 0.18679 0.10552 0.01680
## Proportion of Variance 0.00349 0.00111 0.00003
## Cumulative Proportion 0.99886 0.99997 1.00000
For the standard-error features, the cumulative proportion of variance from PC1 to PC4 is about 86.8% (above 85%).
# PCA restricted to the ten *_se features (columns 12-21), standardised.
# `scale.` is the documented argument name; `scale` only worked through
# partial argument matching.
se_pca <- prcomp(wbcd_pca[, 12:21], scale. = TRUE)
summary(se_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1779 1.4406 1.1245 0.77095 0.75991 0.57939 0.43512
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357 0.01893
## Cumulative Proportion 0.4743 0.6819 0.8083 0.86774 0.92548 0.95905 0.97798
## PC8 PC9 PC10
## Standard deviation 0.3962 0.20436 0.14635
## Proportion of Variance 0.0157 0.00418 0.00214
## Cumulative Proportion 0.9937 0.99786 1.00000
For the worst features, the cumulative proportion of variance from PC1 to PC3 is about 85.9% (above 85%).
# PCA restricted to the ten *_worst features (columns 22-31), standardised.
# `scale.` is the documented argument name; `scale` only worked through
# partial argument matching.
worst_pca <- prcomp(wbcd_pca[, 22:31], scale. = TRUE)
summary(worst_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3869 1.4443 0.89597 0.73531 0.71741 0.42862 0.28959
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837 0.00839
## Cumulative Proportion 0.5697 0.7783 0.85860 0.91267 0.96413 0.98251 0.99089
## PC8 PC9 PC10
## Standard deviation 0.26802 0.12343 0.06326
## Proportion of Variance 0.00718 0.00152 0.00040
## Cumulative Proportion 0.99808 0.99960 1.00000
All features: the eigenvalue = 1 reference line falls between PC6 and PC7, i.e. PC1–PC6 have eigenvalues above 1.
# Scree plot of the component variances for the all-feature PCA, with a
# dashed reference line at eigenvalue 1.
# npcs is 10 (was 15) so the plot matches its own title and the ncp = 10
# used by fviz_eig() below; type is spelled out instead of the abbreviated "l".
screeplot(all_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1", col = "red", lty = 5, cex = 0.6)

# Percentage of variance explained by each of the first ten components.
fviz_eig(
  all_pca,
  addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
  barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10
) +
  labs(
    title = "Cancer All Variances - PCA",
    x = "Principal Components", y = "% of variances"
  )
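Mean features: the eigenvalue = 1 reference line falls between PC2 and PC3, i.e. only PC1 and PC2 have eigenvalues above 1.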
# Scree plot of the component variances for the mean-feature PCA, with a
# dashed reference line at eigenvalue 1.
# This PCA has only 10 components, so npcs = 15 asked for positions that do
# not exist; npcs = 10 matches the data and the title.
screeplot(mean_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1", col = "red", lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(
  mean_pca,
  addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
  barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10
) +
  labs(
    title = "Cancer Mean Variances - PCA",
    x = "Principal Components", y = "% of variances"
  )
SE features: the eigenvalue = 1 reference line falls between PC3 and PC4, i.e. PC1–PC3 have eigenvalues above 1.
# Scree plot of the component variances for the SE-feature PCA, with a
# dashed reference line at eigenvalue 1.
# This PCA has only 10 components, so npcs = 15 asked for positions that do
# not exist; npcs = 10 matches the data and the title.
screeplot(se_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1", col = "red", lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(
  se_pca,
  addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
  barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10
) +
  labs(
    title = "Cancer SE Variances - PCA",
    x = "Principal Components", y = "% of variances"
  )
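Worst features: the eigenvalue = 1 reference line falls between PC2 and PC3, i.e. only PC1 and PC2 have eigenvalues above 1.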
# Scree plot of the component variances for the worst-feature PCA, with a
# dashed reference line at eigenvalue 1.
# This PCA has only 10 components, so npcs = 15 asked for positions that do
# not exist; npcs = 10 matches the data and the title.
screeplot(worst_pca, type = "lines", npcs = 10, main = "Screeplot of the first 10 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = "Eigenvalue = 1", col = "red", lty = 5, cex = 0.6)

# Percentage of variance explained by each component.
fviz_eig(
  worst_pca,
  addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
  barfill = "pink", barcolor = "grey", linecolor = "red", ncp = 10
) +
  labs(
    title = "Cancer Worst Variances - PCA",
    x = "Principal Components", y = "% of variances"
  )
all_var <- get_pca_var(all_pca)
all_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
Quality of representation (cos2) of each variable on the principal components (all features)
library("corrplot")
## corrplot 0.92 loaded
corrplot(all_var$cos2, is.corr=FALSE)
To highlight the most contributing variables for each components
corrplot(all_var$contrib, is.corr=FALSE)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Ten largest variable contributions to PC1 (pink) and to PC2 (sky blue)
# of the all-feature PCA, drawn side by side.
p1 <- fviz_contrib(
  all_pca,
  choice = "var", axes = 1, top = 10,
  fill = "pink", color = "grey"
)
p2 <- fviz_contrib(
  all_pca,
  choice = "var", axes = 2, top = 10,
  fill = "skyblue", color = "grey"
)
grid.arrange(p1, p2, ncol = 2)
mean_var <- get_pca_var(mean_pca)
mean_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
Quality of representation (cos2) of each variable on the principal components (mean features)
library("corrplot")
corrplot(mean_var$cos2, is.corr=FALSE)
To highlight the most contributing variables for each components
corrplot(mean_var$contrib, is.corr=FALSE)
library(gridExtra)
# Ten largest variable contributions to PC1 (pink) and to PC2 (sky blue)
# of the mean-feature PCA, drawn side by side.
p1 <- fviz_contrib(
  mean_pca,
  choice = "var", axes = 1, top = 10,
  fill = "pink", color = "grey"
)
p2 <- fviz_contrib(
  mean_pca,
  choice = "var", axes = 2, top = 10,
  fill = "skyblue", color = "grey"
)
grid.arrange(p1, p2, ncol = 2)
se_var <- get_pca_var(se_pca)
se_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
Quality of representation (cos2) of each variable on the principal components (SE features)
library("corrplot")
corrplot(se_var$cos2, is.corr=FALSE)
To highlight the most contributing variables for each components
corrplot(se_var$contrib, is.corr=FALSE)
library(gridExtra)
# Ten largest variable contributions to PC1 (pink) and to PC2 (sky blue)
# of the SE-feature PCA, drawn side by side.
p1 <- fviz_contrib(
  se_pca,
  choice = "var", axes = 1, top = 10,
  fill = "pink", color = "grey"
)
p2 <- fviz_contrib(
  se_pca,
  choice = "var", axes = 2, top = 10,
  fill = "skyblue", color = "grey"
)
grid.arrange(p1, p2, ncol = 2)
worst_var <- get_pca_var(worst_pca)
worst_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
Quality of representation (cos2) of each variable on the principal components (worst features)
library("corrplot")
corrplot(worst_var$cos2, is.corr=FALSE)
To highlight the most contributing variables for each components
corrplot(worst_var$contrib, is.corr=FALSE)
library(gridExtra)
# Ten largest variable contributions to PC1 (pink) and to PC2 (sky blue)
# of the worst-feature PCA, drawn side by side.
p1 <- fviz_contrib(
  worst_pca,
  choice = "var", axes = 1, top = 10,
  fill = "pink", color = "grey"
)
p2 <- fviz_contrib(
  worst_pca,
  choice = "var", axes = 2, top = 10,
  fill = "skyblue", color = "grey"
)
grid.arrange(p1, p2, ncol = 2)
# Biplot of the all-feature PCA: observations as points coloured by
# diagnosis with an ellipse per class, variables drawn in black.
# `col.var` is written out; the former `col = "black"` reached the same
# argument only through partial argument matching.
fviz_pca_biplot(
  all_pca,
  col.ind = wbcd$diagnosis, col.var = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Biplots of the mean, SE and worst PCAs: observations as points coloured by
# diagnosis with an ellipse per class, variables drawn in black.
# `col.var` is written out; the former `col = "black"` reached the same
# argument only through partial argument matching.
fviz_pca_biplot(
  mean_pca,
  col.ind = wbcd$diagnosis, col.var = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
fviz_pca_biplot(
  se_pca,
  col.ind = wbcd$diagnosis, col.var = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
fviz_pca_biplot(
  worst_pca,
  col.ind = wbcd$diagnosis, col.var = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
Split the data into a training set (70%) and a test set (30%) for evaluating the classification methods.
# Random 70/30 split into training and test rows.
nrows <- NROW(wbcd)
set.seed(218) # fix the RNG state so the split is reproducible
# floor() makes the truncation of 0.7 * 569 = 398.3 to 398 explicit, and
# seq_len() stays well-defined even for an empty table.
index <- sample(seq_len(nrows), floor(0.7 * nrows))
#train <- wbcd ## all 569 rows (100%)
train <- wbcd[index, ] ## 398 training rows (70%)
test <- wbcd[-index, ] ## 171 test rows (30%)
prop.table(table(train$diagnosis)) #proportion of diagnosis (Benign / Malignant)
##
## Benign Malignant
## 0.6180905 0.3819095
prop.table(table(test$diagnosis))
##
## Benign Malignant
## 0.6491228 0.3508772
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
# Naive Bayes: fit on the training predictors (every column but the first,
# which is the diagnosis), predict the test rows, and cross-tabulate the
# predictions against the true labels.
# NOTE(review): the printed output shows 'Positive' Class : Benign, so the
# reported sensitivity refers to benign cases -- confirm that is intended.
learn_nb <- naiveBayes(x = train[, -1], y = train$diagnosis)
pre_nb <- predict(learn_nb, newdata = test[, -1])
cm_nb <- confusionMatrix(data = pre_nb, reference = test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 107 6
## Malignant 4 54
##
## Accuracy : 0.9415
## 95% CI : (0.8951, 0.9716)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8706
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9640
## Specificity : 0.9000
## Pos Pred Value : 0.9469
## Neg Pred Value : 0.9310
## Prevalence : 0.6491
## Detection Rate : 0.6257
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.9320
##
## 'Positive' Class : Benign
##
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest with 500 trees, keeping the proximity matrix and the
# variable-importance measures. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
learn_rf <- randomForest(
  diagnosis ~ .,
  data = train,
  ntree = 500,
  proximity = TRUE,
  importance = TRUE
)
# Predict the held-out rows and compare with the true diagnosis.
pre_rf <- predict(learn_rf, test[, -1])
cm_rf <- confusionMatrix(pre_rf, test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 111 4
## Malignant 0 56
##
## Accuracy : 0.9766
## 95% CI : (0.9412, 0.9936)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9478
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.9333
## Pos Pred Value : 0.9652
## Neg Pred Value : 1.0000
## Prevalence : 0.6491
## Detection Rate : 0.6491
## Detection Prevalence : 0.6725
## Balanced Accuracy : 0.9667
##
## 'Positive' Class : Benign
##
library(rpart)
# Single classification tree; minsplit = 2 lets a node with as few as two
# observations be considered for a split.
learn_rp <- rpart(
  formula = diagnosis ~ .,
  data = train,
  control = rpart.control(minsplit = 2)
)
# Class predictions for the test rows and the resulting confusion matrix.
pre_rp <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(data = pre_rp, reference = test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 108 5
## Malignant 3 55
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8965
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9730
## Specificity : 0.9167
## Pos Pred Value : 0.9558
## Neg Pred Value : 0.9483
## Prevalence : 0.6491
## Detection Rate : 0.6316
## Detection Prevalence : 0.6608
## Balanced Accuracy : 0.9448
##
## 'Positive' Class : Benign
##
library(rpart)
library(ada)
# Boosted trees ("gentle" variant, 70 iterations) on rpart base learners
# configured by `control`.
# NOTE(review): test.x / test.y are given the *training* rows, so whatever
# ada tracks for the "test" set is computed on training data -- confirm this
# is intended.
control <- rpart.control(cp = -1, maxdepth = 14, maxcompete = 1, xval = 0)
learn_ada <- ada(
  diagnosis ~ .,
  data = train,
  test.x = train[, -1],
  test.y = train[, 1],
  type = "gentle",
  control = control,
  iter = 70
)
# Predict the held-out rows and compare with the true diagnosis.
pre_ada <- predict(learn_ada, newdata = test[, -1])
cm_ada <- confusionMatrix(data = pre_ada, reference = test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 110 2
## Malignant 1 58
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9613
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9910
## Specificity : 0.9667
## Pos Pred Value : 0.9821
## Neg Pred Value : 0.9831
## Prevalence : 0.6491
## Detection Rate : 0.6433
## Detection Prevalence : 0.6550
## Balanced Accuracy : 0.9788
##
## 'Positive' Class : Benign
##
# Support vector machine with the package defaults for kernel, cost and gamma.
learn_svm <- svm(formula = diagnosis ~ ., data = train)
# Predict the held-out rows and compare with the true diagnosis.
pre_svm <- predict(learn_svm, newdata = test[, -1])
cm_svm <- confusionMatrix(data = pre_svm, reference = test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 109 1
## Malignant 2 59
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9616
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9820
## Specificity : 0.9833
## Pos Pred Value : 0.9909
## Neg Pred Value : 0.9672
## Prevalence : 0.6491
## Detection Rate : 0.6374
## Detection Prevalence : 0.6433
## Balanced Accuracy : 0.9827
##
## 'Positive' Class : Benign
##
# Grid search over SVM cost and gamma.
gamma <- seq(0, 0.1, 0.005) # 21 values
cost <- 2^(0:5) # 6 values
parms <- expand.grid(cost = cost, gamma = gamma) # 6 x 21 = 126 combinations

# NOTE(review): each candidate is scored on `test`, the same rows used for the
# final comparison, so the accuracy of the selected model is optimistic.
# Consider choosing the parameters by cross-validation on `train` instead.
# Behaviour is kept as-is here so the recorded results stay valid.

# Accuracy of one parameter combination. Local names are used so the default
# model (`learn_svm`, `pre_svm`) fitted earlier is no longer overwritten.
accuracy2 <- vapply(
  seq_len(nrow(parms)),
  function(i) {
    fit_i <- svm(
      diagnosis ~ .,
      data = train,
      gamma = parms$gamma[i],
      cost = parms$cost[i]
    )
    pred_i <- predict(fit_i, test[, -1])
    confusionMatrix(pred_i, test$diagnosis)$overall[[1]]
  },
  numeric(1)
)

# One row per combination: p = row index into `parms`, cnt = test accuracy.
acc <- data.frame(p = seq_len(nrow(parms)), cnt = accuracy2)
# First combination that reaches the maximum accuracy.
opt_p <- acc[which.max(acc$cnt), ]
sub <- paste("Optimal number of parameter is", opt_p$p, "(accuracy :", opt_p$cnt,") in SVM")
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# Line chart of test accuracy (cnt) against the index of the parameter
# combination (p); the subtitle reports the best combination found.
hchart(acc, "line", hcaes(x = p, y = cnt)) %>%
  hc_title(text = "Accuracy With Varying Parameters (SVM)") %>%
  hc_subtitle(text = sub) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_xAxis(title = list(text = "Number of Parameters")) %>%
  hc_yAxis(title = list(text = "Accuracy"))
# Refit the SVM with the cost/gamma pair stored in row opt_p$p of `parms`,
# then evaluate it on the test rows.
learn_imp_svm <- svm(
  diagnosis ~ .,
  data = train,
  cost = parms[opt_p$p, "cost"],
  gamma = parms[opt_p$p, "gamma"]
)
pre_imp_svm <- predict(learn_imp_svm, newdata = test[, -1])
cm_imp_svm <- confusionMatrix(data = pre_imp_svm, reference = test$diagnosis)
cm_imp_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 110 1
## Malignant 1 59
##
## Accuracy : 0.9883
## 95% CI : (0.9584, 0.9986)
## No Information Rate : 0.6491
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9743
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9910
## Specificity : 0.9833
## Pos Pred Value : 0.9910
## Neg Pred Value : 0.9833
## Prevalence : 0.6491
## Detection Rate : 0.6433
## Detection Prevalence : 0.6491
## Balanced Accuracy : 0.9872
##
## 'Positive' Class : Benign
##
# Fourfold displays of the six confusion matrices on a 2 x 3 grid, each titled
# with the model name and its rounded test accuracy in percent.
col <- c("#ed3b3b", "#0099ff")
cm_by_model <- list(
  "NaiveBayes" = cm_nb,
  "RPart" = cm_rp,
  "RandomForest" = cm_rf,
  "AdaBoost" = cm_ada,
  "SVM" = cm_svm,
  "Tune SVM" = cm_imp_svm
)

# Remember the previous graphics layout so it can be restored afterwards;
# otherwise later base plots would keep being drawn on the 2 x 3 grid.
old_par <- par(mfrow = c(2, 3))
for (model_name in names(cm_by_model)) {
  cm <- cm_by_model[[model_name]]
  fourfoldplot(
    cm$table,
    color = col, conf.level = 0, margin = 1,
    main = paste0(model_name, " (", round(cm$overall[1] * 100), "%)")
  )
}
par(old_par)
# Gather the first `overall` statistic (the accuracy shown in the printed
# confusion matrices) of every model, label it, and keep the model(s) that
# reach the maximum.
opt_predict <- vapply(
  list(cm_nb, cm_rp, cm_rf, cm_ada, cm_svm, cm_imp_svm),
  function(cm) cm$overall[[1]],
  numeric(1)
)
names(opt_predict) <- c("Naive Bayes", "RPart", "Random Forest", "AdaBoost", "SVM", "SVM Tune")
best_predict_model <- opt_predict[opt_predict == max(opt_predict)]
best_predict_model
## SVM Tune
## 0.9883041